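In this document, we try to understand the thresholded batting scores of Test cricketers, i.e. the runs a batsman scores beyond a fixed threshold.
The hypothesis is that once a cricketer has scored more than a certain number of runs (the threshold), he is likely to go on and score more; to examine this, the threshold is subtracted from each score before the career average is computed.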
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Load the innings-level batting table from the working directory.
# Column types are guessed by readr; `runs` is read as text.
clean_test_batting_inngings <- readr::read_csv(
  file = "./clean_test_batting_inngings.csv"
)
## Parsed with column specification:
## cols(
## country = col_character(),
## player_name = col_character(),
## runs = col_character(),
## mins = col_double(),
## balls_faced = col_double(),
## fours = col_double(),
## sixes = col_double(),
## strike_rate = col_double(),
## pos = col_double(),
## dismissal = col_character(),
## inns = col_double(),
## opposition = col_character(),
## ground = col_character(),
## start_date = col_character(),
## test_number = col_character()
## )
# Add dismissal flags and an integer run count to every innings row.
inns_data <- dplyr::mutate(
  clean_test_batting_inngings,
  # An innings is flagged `not_out` when `runs` contains "*" or "DNB"
  # (presumably "did not bat" -- confirm against the data source).
  not_out = stringr::str_detect(runs, "[*]") |
    stringr::str_detect(runs, "DNB"),
  out = !not_out,
  # Pull the numeric part out of `runs`; entries with no digits (which
  # would otherwise be NA) are counted as 0 runs.
  runs_num = dplyr::coalesce(
    as.integer(stringr::str_extract(runs, "\\-*\\d+\\.*\\d*")),
    0L
  )
)
# thres_data %>%
# group_by(player_name) %>%
# summarise(career_avg = sum(runs_num)/sum(out))
# Per-player average of the runs scored beyond 30: scores below 30 are
# clamped to 0, and the total is divided by the number of dismissals.
thres_data_30 <- inns_data %>%
  dplyr::mutate(runs_num_30 = pmax(runs_num - 30L, 0)) %>%
  dplyr::group_by(player_name) %>%
  dplyr::summarise(career_avg = sum(runs_num_30) / sum(out))
# Career summary per player: batting average (total runs divided by the
# number of dismissals) and the number of innings rows.
# NOTE(review): `inns` counts every row for the player, including rows that
# were flagged `not_out` because `runs` contained "DNB" -- confirm this is
# the intended innings count.
career_data <- dplyr::summarise(
  dplyr::group_by(inns_data, player_name),
  career_avg = sum(runs_num) / sum(out),
  inns = n()
)
# Keep players with at least 20 innings rows, then the 20 highest career
# averages (`top_n()` keeps ties, so more than 20 rows are possible).
# NOTE(review): the object is called "top100" but only 20 players are kept.
career_data_top100 <- dplyr::top_n(
  dplyr::filter(career_data, inns >= 20L),
  n = 20,
  wt = career_avg
)

# Innings-level rows restricted to the selected players.
inns_data_top100 <- dplyr::filter(
  inns_data,
  player_name %in% career_data_top100[["player_name"]]
)
#' Career batting average after subtracting a run threshold
#'
#' For every innings the runs beyond `thres` are kept (scores below the
#' threshold count as zero). For each player, the sum of these excess runs is
#' divided by the player's number of dismissals.
#'
#' @param data Data frame of innings with the columns `player_name`,
#'   `runs_num` (numeric runs) and `out` (logical, `TRUE` when dismissed).
#' @param thres A single, non-missing number: the threshold to subtract.
#' @return A summary table with one row per `player_name` and the column
#'   `career_avg_thres`. A player with zero dismissals gets `Inf` or `NaN`,
#'   because the denominator is 0.
thres_career_avg <- function(data, thres) {
  # Fail fast: a vector `thres` would otherwise be silently recycled against
  # the rows of `data` and produce meaningless averages.
  stopifnot(
    is.data.frame(data),
    all(c("player_name", "runs_num", "out") %in% names(data)),
    is.numeric(thres),
    length(thres) == 1L,
    !is.na(thres)
  )

  # All dplyr calls are namespace-qualified so the function does not depend
  # on which packages happen to be attached.
  excess <- dplyr::mutate(data, runs_num_thres = pmax(runs_num - thres, 0))
  by_player <- dplyr::group_by(excess, player_name)
  dplyr::summarise(
    by_player,
    career_avg_thres = sum(runs_num_thres) / sum(out)
  )
}
# thres_career_avg(data = inns_data_top100, thres = 30L)
# Thresholds to evaluate: every integer from 0 to 30.
thres <- 0:30

# One per-player summary table for each threshold, in the order of `thres`.
list_thres_career_avg <- purrr::map(
  thres,
  function(cutoff) {
    thres_career_avg(data = inns_data_top100, thres = cutoff)
  }
)
# Bind the per-threshold averages side by side: one column per threshold.
mat_thres_career_avg <- do.call(
  cbind,
  purrr::map(list_thres_career_avg, "career_avg_thres")
)

# Wide table whose columns are named by the zero-padded threshold.
df_thres_career_avg <- tibble::as_tibble(data.frame(mat_thres_career_avg))
names(df_thres_career_avg) <- sprintf("%02d", thres)

# Player names are taken from the first summary table; this relies on every
# table in the list having its rows in the same player order.
df_thres_career_avg <- dplyr::mutate(
  df_thres_career_avg,
  player_name = list_thres_career_avg[[1]]$player_name
)
# Long format: one row per (player, threshold) pair, plus the player's rank
# among all players at that threshold. `rank()` is ascending with its
# default tie handling, so rank 1 is the lowest average at a threshold.
df_thres_career_avg_long <- df_thres_career_avg %>%
  tidyr::gather(
    key = thres,
    value = career_avg_thres,
    -player_name
  ) %>%
  dplyr::group_by(thres) %>%
  dplyr::mutate(career_avg_thres_rank = rank(career_avg_thres)) %>%
  # Drop the grouping so later verbs on this table are not silently applied
  # per threshold.
  dplyr::ungroup()
# Thresholded career average against threshold, one coloured line per player.
# "DG Bradman" is left out of this plot (presumably so that his values do not
# stretch the y-axis -- confirm).
ggplot(
  data = dplyr::filter(df_thres_career_avg_long, player_name != "DG Bradman"),
  mapping = aes(
    x = thres,
    y = career_avg_thres,
    colour = player_name
  )
) +
  geom_point() +
  geom_line(aes(group = player_name)) +
  theme(legend.position = "none")
# scale_colour_brewer(palette = "Set1")
# Rank of each player's thresholded average against threshold, one coloured
# line per player (all players included).
ggplot(
  # dplyr::filter(player_name != "DG Bradman")
  data = df_thres_career_avg_long,
  mapping = aes(
    x = thres,
    y = career_avg_thres_rank,
    colour = player_name
  )
) +
  geom_point() +
  geom_line(aes(group = player_name)) +
  theme(legend.position = "none")
# scale_colour_brewer(palette = "Set1")
# Interactive version of the most recently created ggplot.
plotly::ggplotly(p = ggplot2::last_plot())

# Record the R version and package versions used for this report.
utils::sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_AU.UTF-8/en_AU.UTF-8/en_AU.UTF-8/C/en_AU.UTF-8/en_AU.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] forcats_0.4.0 stringr_1.4.0 dplyr_0.8.3 purrr_0.3.2
## [5] readr_1.3.1 tidyr_0.8.3 tibble_2.1.3 ggplot2_3.2.1
## [9] tidyverse_1.2.1
##
## loaded via a namespace (and not attached):
## [1] tidyselect_0.2.5 xfun_0.9 haven_2.1.1
## [4] lattice_0.20-38 colorspace_1.4-1 generics_0.0.2
## [7] vctrs_0.2.0 htmltools_0.3.6 viridisLite_0.3.0
## [10] yaml_2.2.0 plotly_4.9.0 rlang_0.4.0
## [13] later_0.8.0 pillar_1.4.2 glue_1.3.1
## [16] withr_2.1.2 modelr_0.1.5 readxl_1.3.1
## [19] munsell_0.5.0 gtable_0.3.0 cellranger_1.1.0
## [22] rvest_0.3.4 htmlwidgets_1.3 evaluate_0.14
## [25] labeling_0.3 knitr_1.24 httpuv_1.5.1
## [28] crosstalk_1.0.0 Cairo_1.5-10 broom_0.5.2
## [31] Rcpp_1.0.2 xtable_1.8-4 promises_1.0.1
## [34] scales_1.0.0 backports_1.1.4 jsonlite_1.6
## [37] mime_0.7 hms_0.5.1 digest_0.6.20
## [40] stringi_1.4.3 shiny_1.3.2 grid_3.6.0
## [43] cli_1.1.0 tools_3.6.0 magrittr_1.5
## [46] lazyeval_0.2.2 crayon_1.3.4 pkgconfig_2.0.2
## [49] zeallot_0.1.0 data.table_1.12.2 xml2_1.2.2
## [52] lubridate_1.7.4 assertthat_0.2.1 rmarkdown_1.15
## [55] httr_1.4.1 rstudioapi_0.10 R6_2.4.0
## [58] nlme_3.1-141 compiler_3.6.0